Industrial safety. NLP based Chatbot.
The database comes from one of the biggest industries in Brazil and in the world. There is an urgent need for industries/companies around the globe to understand why employees still suffer injuries/accidents in plants — and sometimes even die in such environments.
The database consists of accident records from 12 different plants in 3 different countries, where every line in the data is an occurrence of an accident.
Link to download the dataset: https://drive.google.com/file/d/1_GmrRP1S2OIa02KlfOBNkYa8uxazGbfE/view?usp=sharing, Original dataset link: https://www.kaggle.com/ihmstefanini/industrial-safety-and-health-analytics-database
Design a ML/DL based chatbot utility which can help the professionals to highlight the safety risk as per the incident description.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
Replace the folder path with your Drive folder path and clone the GitHub repo
%cd /content/drive/MyDrive/AIML/Capstone
/content/drive/MyDrive/AIML/Capstone
Run the code below to clone the repo into our Google Drive for the first time
! git clone https://github.com/shyamsparrow/Jan-G4---NLP-Chatbot.git
fatal: destination path 'Jan-G4---NLP-Chatbot' already exists and is not an empty directory.
Change the current folder to the GitHub repo's main folder; all the functions we are calling have to be in this repo.
%cd /content/drive/MyDrive/AIML/Capstone/Jan-G4---NLP-Chatbot
/content/drive/MyDrive/AIML/Capstone/Jan-G4---NLP-Chatbot
!git checkout ALL_Models
Checking out files: 100% (30/30), done. Switched to branch 'ALL_Models' Your branch is behind 'origin/ALL_Models' by 12 commits, and can be fast-forwarded. (use "git pull" to update your local branch)
!git status
On branch ALL_Models Your branch is behind 'origin/ALL_Models' by 12 commits, and can be fast-forwarded. (use "git pull" to update your local branch) Untracked files: (use "git add <file>..." to include in what will be committed) Multiclass ROC.png Transformer_aug_model.h5 Utilities/__pycache__/ cache_dir/ distilbert-New/ distilbert/ ngrok ngrok-stable-linux-amd64.tgz runs/ wandb/ nothing added to commit but untracked files present (use "git add" to track)
!git pull
Already up to date.
Run this code to pull the latest changes from the repo into our Drive
import warnings
warnings.filterwarnings('ignore')
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import holoviews as hv
from holoviews import opts
import nltk
from nltk.util import ngrams
nltk.download('vader_lexicon')
[nltk_data] Downloading package vader_lexicon to /root/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
True
industry_df = pd.read_csv("industry_df_preprocessed.csv")
def ngram_func(ngram, trg='', trg_value=''):
    """Return the 30 most frequent n-grams of the accident descriptions.

    Parameters
    ----------
    ngram : int
        The n-gram order (1 = unigram, 2 = bigram, 3 = trigram, ...).
    trg : str, optional
        Column of the module-level ``industry_df`` to filter on.
        Empty string (default) means no filtering.
    trg_value : list, optional
        Values of column ``trg`` to keep (tested with ``Series.isin``).

    Returns
    -------
    pandas.DataFrame
        Indexed by the space-joined n-gram text, with a single
        ``count`` column, sorted most-frequent first (top 30).
    """
    if (trg == '') or (trg_value == ''):
        # No filter requested: use every preprocessed description.
        descriptions = industry_df['Description_preprocessed']
    else:
        descriptions = industry_df[industry_df[trg].isin(trg_value)]['Description_preprocessed']
    # ' '.join is linear in total text size; Series.sum() on strings
    # concatenates repeatedly and is quadratic.
    tokens = ' '.join(descriptions).split()
    top_ngrams = nltk.FreqDist(nltk.ngrams(tokens, ngram)).most_common(30)
    ngram_df = pd.DataFrame(top_ngrams, columns=['ngram', 'count'])
    # Use the human-readable n-gram text as the index for plotting.
    ngram_df.index = [' '.join(i) for i in ngram_df.ngram]
    ngram_df.drop('ngram', axis=1, inplace=True)
    return ngram_df
hv.extension('bokeh')
hv.Bars(ngram_func(1)[::-1]).opts(title="Unigram Count top-30", color="red", xlabel="Unigrams", ylabel="Count")\
.opts(opts.Bars(width=600, height=600,tools=['hover'],show_grid=True,invert_axes=True))
hv.extension('bokeh')
hv.Bars(ngram_func(2)[::-1]).opts(title="Bigram Count top-30", color="yellow", xlabel="Bigrams", ylabel="Count")\
.opts(opts.Bars(width=600, height=600,tools=['hover'],show_grid=True,invert_axes=True))
hv.extension('bokeh')
hv.Bars(ngram_func(3)[::-1]).opts(title="Trigram Count top-30", color="blue", xlabel="Trigrams", ylabel="Count")\
.opts(opts.Bars(width=600, height=600,tools=['hover'],show_grid=True,invert_axes=True))
industry_df = pd.read_csv("industry_df_with_stopwords.csv")
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
from scipy import interp
import pickle
# Deep learning libraries
import tensorflow as tf
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, GlobalMaxPool1D
from keras.utils import np_utils
from keras.layers.merge import Concatenate
#from keras.utils import plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# Keras pre-processing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Select input and output features
X_text = industry_df['Description_preprocessed']
y_text = industry_df['Potential_Accident_Level']
# Encode labels in column 'Potential Accident Level'and Convert into one-hot encoded vectors:
le = LabelEncoder()
le.fit(y_text)
y_text_le = le.transform(y_text)
# Divide our data into testing and training sets:
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, y_text_le, test_size = 0.20, random_state = 1, stratify=y_text_le)
print('X_text_train shape : ({0})'.format(X_text_train.shape))
print('y_text_train shape : ({0},)'.format(y_text_train.shape))
print('X_text_test shape : ({0})'.format(X_text_test.shape))
print('y_text_test shape : ({0},)'.format(y_text_test.shape))
X_text_train shape : ((328,)) y_text_train shape : ((328,),) X_text_test shape : ((83,)) y_text_test shape : ((83,),)
print(np.bincount(y_text_train))
print(np.bincount(y_text_test))
[ 34 76 85 110 23] [ 9 19 21 28 6]
y_text_train = np_utils.to_categorical(y_text_train)
y_text_test = np_utils.to_categorical(y_text_test)
# The first step in word embeddings is to convert the words into their corresponding numeric indexes.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_text_train)
X_text_train = tokenizer.texts_to_sequences(X_text_train)
X_text_test = tokenizer.texts_to_sequences(X_text_test)
industry_df['nb_words'] = industry_df['Description_preprocessed'].apply(lambda x: len(x.split(' ')))
print('Minimum number of words: {}'.format(industry_df['nb_words'].min()))
print('Maximum number of words: {}'.format(industry_df['nb_words'].max()))
Minimum number of words: 16 Maximum number of words: 183
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)
maxlen = 200
X_text_train = pad_sequences(X_text_train, padding='post', maxlen=maxlen)
X_text_test = pad_sequences(X_text_test, padding='post', maxlen=maxlen)
vocab_size: 2865
industry_df['sentence_length'].value_counts()
X_text_train.shape
(328, 200)
# Glove Vector
print('Indexing word vectors.')
embeddings_dictionary = {}
f = open('glove.6B.200d.txt', encoding='utf-8')
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_dictionary[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_dictionary))
pickle.dump(embeddings_dictionary, open('/content/drive/MyDrive/AIML/Capstone/embeddings.pickle', 'wb'))
infile = open('/content/drive/MyDrive/AIML/Capstone/embeddings.pickle','rb')
embeddings_dictionary = pickle.load(infile)
def build_vocab(texts):
    """Count word occurrences across a Series of whitespace-separated texts.

    Parameters
    ----------
    texts : pandas.Series of str
        One document per element; tokens are split on whitespace.

    Returns
    -------
    dict
        Mapping of word -> number of occurrences over all documents.
    """
    counts = {}
    for tokens in texts.apply(lambda t: t.split()).values:
        for token in tokens:
            # dict.get with a default avoids the try/except dance.
            counts[token] = counts.get(token, 0) + 1
    return counts
len(vocab)
3449
import operator
def check_coverage(vocab, embeddings_index):
    """Report how much of *vocab* is covered by *embeddings_index*.

    Prints the fraction of distinct vocabulary words and of total token
    occurrences that have an embedding vector.

    Parameters
    ----------
    vocab : dict
        Word -> occurrence count (e.g. the output of ``build_vocab``).
    embeddings_index : dict
        Word -> embedding vector (e.g. the GloVe dictionary).

    Returns
    -------
    list of tuple
        (word, count) pairs that have NO embedding, sorted by count
        descending (most frequent out-of-vocabulary words first).
    """
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        # Catch only the missing-key case; the original bare `except`
        # would also have silently hidden genuine bugs.
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += count
        except KeyError:
            unknown_words[word] = count
            nb_unknown_words += count
    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words
oov_glove = check_coverage(vocab, embeddings_dictionary)
Found embeddings for 93.97% of vocab Found embeddings for 98.93% of all text
embedding_size = 200
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, index in tokenizer.word_index.items():
embedding_vector = embeddings_dictionary.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
embedding_matrix.shape
(3446, 200)
base_model = Sequential()
base_model.add(Embedding(vocab_size, output_dim=200, input_length=200, trainable=True))
#LSTM
base_model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.5 , dropout = 0.5)))
base_model.add(Dense(5, activation='softmax'))
base_model.compile(optimizer=optimizers.Adam(lr = 0.001), loss='binary_crossentropy', metrics=['acc'])
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
base_model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 200, 200) 573000
bidirectional (Bidirectiona (None, 256) 336896
l)
dense (Dense) (None, 5) 1285
=================================================================
Total params: 911,181
Trainable params: 911,181
Non-trainable params: 0
_________________________________________________________________
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.001, patience=5)
#callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
# fit the keras model on the dataset
base_model_history = base_model.fit(X_text_train, y_text_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_text_test, y_text_test), callbacks=[rlrp,callback])
Epoch 1/10 11/11 [==============================] - 37s 3s/step - loss: 0.5853 - acc: 0.3140 - val_loss: 0.4961 - val_acc: 0.3373 - lr: 0.0010 Epoch 2/10 11/11 [==============================] - 26s 2s/step - loss: 0.4736 - acc: 0.3354 - val_loss: 0.4751 - val_acc: 0.3373 - lr: 0.0010 Epoch 3/10 11/11 [==============================] - 26s 2s/step - loss: 0.4677 - acc: 0.3354 - val_loss: 0.4713 - val_acc: 0.3373 - lr: 0.0010 Epoch 4/10 11/11 [==============================] - 25s 2s/step - loss: 0.4617 - acc: 0.3567 - val_loss: 0.4709 - val_acc: 0.3373 - lr: 0.0010 Epoch 5/10 11/11 [==============================] - 25s 2s/step - loss: 0.4464 - acc: 0.3902 - val_loss: 0.4704 - val_acc: 0.3133 - lr: 0.0010 Epoch 6/10 11/11 [==============================] - 26s 2s/step - loss: 0.4014 - acc: 0.5396 - val_loss: 0.4903 - val_acc: 0.1928 - lr: 0.0010 Epoch 7/10 11/11 [==============================] - 25s 2s/step - loss: 0.3362 - acc: 0.7073 - val_loss: 0.4986 - val_acc: 0.3253 - lr: 0.0010 Epoch 8/10 11/11 [==============================] - 25s 2s/step - loss: 0.2484 - acc: 0.7896 - val_loss: 0.5496 - val_acc: 0.3494 - lr: 0.0010 Epoch 9/10 11/11 [==============================] - 25s 2s/step - loss: 0.1875 - acc: 0.8476 - val_loss: 0.5942 - val_acc: 0.3253 - lr: 0.0010 Epoch 10/10 11/11 [==============================] - 25s 2s/step - loss: 0.1652 - acc: 0.8689 - val_loss: 0.6204 - val_acc: 0.3253 - lr: 0.0010
_, train_accuracy = base_model.evaluate(X_text_train, y_text_train, batch_size=32, verbose=0)
_, test_accuracy = base_model.evaluate(X_text_test, y_text_test, batch_size=32, verbose=0)
print('Train accuracy: %.2f' % (train_accuracy*100))
print('Test accuracy: %.2f' % (test_accuracy*100))
Train accuracy: 88.72 Test accuracy: 32.53
y_pred = base_model.predict(X_text_test, verbose=0) # Multiclass
y_pred_index = np.argmax(y_pred,axis=1)
y_original = np.argmax(y_text_test, axis =1)
cm = confusion_matrix(y_original,y_pred_index)
cm_label = ['I', 'II', 'III','IV', 'V']
plt.figure(figsize=(12,6))
sns.heatmap(cm, annot=True, cmap='Blues',xticklabels = cm_label, yticklabels = cm_label);
print(classification_report(y_original, y_pred_index))
precision recall f1-score support
0 0.80 0.44 0.57 9
1 0.19 0.32 0.24 19
2 0.34 0.48 0.40 21
3 0.41 0.25 0.31 28
4 0.00 0.00 0.00 6
accuracy 0.33 83
macro avg 0.35 0.30 0.30 83
weighted avg 0.36 0.33 0.32 83
# Build a LSTM Neural Network
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 200, trainable=True)(deep_inputs)
LSTM_Layer_1 = Bidirectional(LSTM(128, return_sequences = True))(embedding_layer)
max_pool_layer_1 = GlobalMaxPool1D()(LSTM_Layer_1)
drop_out_layer_1 = Dropout(0.5, input_shape = (256,))(max_pool_layer_1)
dense_layer_1 = Dense(128, activation = 'relu')(drop_out_layer_1)
drop_out_layer_2 = Dropout(0.5, input_shape = (128,))(dense_layer_1)
dense_layer_2 = Dense(64, activation = 'relu')(drop_out_layer_2)
drop_out_layer_3 = Dropout(0.5, input_shape = (64,))(dense_layer_2)
dense_layer_3 = Dense(32, activation = 'relu')(drop_out_layer_3)
drop_out_layer_4 = Dropout(0.5, input_shape = (32,))(dense_layer_3)
dense_layer_4 = Dense(10, activation = 'relu')(drop_out_layer_4)
drop_out_layer_5 = Dropout(0.5, input_shape = (10,))(dense_layer_4)
dense_layer_5 = Dense(5, activation='softmax')(drop_out_layer_5)
lstm_model = Model(inputs=deep_inputs, outputs=dense_layer_5)
opt = optimizers.Adam(lr=0.001)
#opt = optimizers.SGD(lr=0.0001, momentum=0.9)
lstm_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
print(lstm_model.summary())
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 200)] 0
embedding_1 (Embedding) (None, 200, 200) 573000
bidirectional_1 (Bidirectio (None, 200, 256) 336896
nal)
global_max_pooling1d (Globa (None, 256) 0
lMaxPooling1D)
dropout (Dropout) (None, 256) 0
dense_1 (Dense) (None, 128) 32896
dropout_1 (Dropout) (None, 128) 0
dense_2 (Dense) (None, 64) 8256
dropout_2 (Dropout) (None, 64) 0
dense_3 (Dense) (None, 32) 2080
dropout_3 (Dropout) (None, 32) 0
dense_4 (Dense) (None, 10) 330
dropout_4 (Dropout) (None, 10) 0
dense_5 (Dense) (None, 5) 55
=================================================================
Total params: 953,513
Trainable params: 953,513
Non-trainable params: 0
_________________________________________________________________
None
#callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3, min_delta=1E-2)
#rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.01, patience=3, min_delta=1E-2)
# metrics = Metrics(validation_data=(X_text_train, np.array(y_aug_train)))
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
# fit the keras model on the dataset
base_model_history = lstm_model.fit(X_text_train, y_text_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_text_test, y_text_test), callbacks=[callback])
Epoch 1/10 11/11 [==============================] - 8s 209ms/step - loss: 1.6054 - acc: 0.2378 - val_loss: 1.5880 - val_acc: 0.3735 Epoch 2/10 11/11 [==============================] - 1s 99ms/step - loss: 1.5831 - acc: 0.2409 - val_loss: 1.5565 - val_acc: 0.2530 Epoch 3/10 11/11 [==============================] - 1s 95ms/step - loss: 1.5700 - acc: 0.2530 - val_loss: 1.5431 - val_acc: 0.3373 Epoch 4/10 11/11 [==============================] - 1s 93ms/step - loss: 1.5586 - acc: 0.2530 - val_loss: 1.5323 - val_acc: 0.3373 Epoch 5/10 11/11 [==============================] - 1s 92ms/step - loss: 1.5559 - acc: 0.2774 - val_loss: 1.5331 - val_acc: 0.3373 Epoch 6/10 11/11 [==============================] - 1s 91ms/step - loss: 1.5359 - acc: 0.3110 - val_loss: 1.5359 - val_acc: 0.3373 Epoch 7/10 11/11 [==============================] - 1s 91ms/step - loss: 1.5292 - acc: 0.3201 - val_loss: 1.5229 - val_acc: 0.3373 Epoch 8/10 11/11 [==============================] - 1s 91ms/step - loss: 1.5567 - acc: 0.2866 - val_loss: 1.5189 - val_acc: 0.3373 Epoch 9/10 11/11 [==============================] - 1s 90ms/step - loss: 1.5361 - acc: 0.2896 - val_loss: 1.5193 - val_acc: 0.3373 Epoch 10/10 11/11 [==============================] - 1s 92ms/step - loss: 1.5308 - acc: 0.3140 - val_loss: 1.5158 - val_acc: 0.3373
with open('/content/drive/MyDrive/AIML/Capstone/parameters.pickle', "wb") as f:
pickle.dump((le, tokenizer), f)
lstm_model_json = lstm_model.to_json()
with open("LSTM_aug_model.json", "w") as json_file:
json_file.write(lstm_model_json)
# serialize weights to HDF5
lstm_model.save_weights("LSTM_aug_model_weights.h5")
print("Saved model weights to disk")
# Save the model in h5 format
lstm_model.save("LSTM_aug_model.h5")
print("Saved model to disk")
Saved model weights to disk Saved model to disk
# evaluate the keras model
_, train_accuracy = lstm_model.evaluate(X_text_train, y_text_train, batch_size=32, verbose=0)
_, test_accuracy = lstm_model.evaluate(X_text_test, y_text_test, batch_size=32, verbose=0)
print('Train accuracy: %.2f' % (train_accuracy*100))
print('Test accuracy: %.2f' % (test_accuracy*100))
Train accuracy: 33.54 Test accuracy: 33.73
with open('/content/drive/MyDrive/AIML/Capstone/Jan-G4---NLP-Chatbot/Utilities/result_df.csv', "rb") as f:
result_df = pickle.load(f)
y_pred = lstm_model.predict(X_text_test, verbose=0) # Multiclass
y_pred_index = np.argmax(y_pred,axis=1)
y_original = np.argmax(y_text_test, axis =1)
_,tr_accuracy = lstm_model.evaluate(X_text_train,y_text_train)
_,te_accuracy = lstm_model.evaluate(X_text_test,y_text_test)
F1_sre= f1_score(y_original,y_pred_index, average='micro')
tr_accuracy,te_accuracy,F1_sre
11/11 [==============================] - 0s 35ms/step - loss: 1.5074 - acc: 0.3354 3/3 [==============================] - 0s 35ms/step - loss: 1.5158 - acc: 0.3373
(0.3353658616542816, 0.33734938502311707, 0.3373493975903614)
result_df.loc[len(result_df.index)] = ['LSTM_Aug_Model', tr_accuracy, te_accuracy,F1_sre]
result_df
| Model | Train_accuracy | Test_accuracy | F1_score | |
|---|---|---|---|---|
| 0 | LogReg | 0.890244 | 0.890244 | 0.373494 |
| 1 | Naive Bayes | 1.000000 | 1.000000 | 0.421687 |
| 2 | KNN | 0.591463 | 0.591463 | 0.361446 |
| 3 | SVM | 0.990854 | 0.990854 | 0.361446 |
| 4 | Decision Tree | 0.512195 | 0.512195 | 0.313253 |
| 5 | RandomForest | 0.612805 | 0.612805 | 0.373494 |
| 6 | Bagging | 1.000000 | 1.000000 | 0.373494 |
| 7 | AdaBoost | 0.393293 | 0.393293 | 0.349398 |
| 8 | Gradient Boost | 0.920732 | 0.920732 | 0.325301 |
| 9 | XGBoost | 0.951220 | 0.951220 | 0.313253 |
| 10 | ANN_Model | 0.905488 | 0.409639 | 0.409639 |
| 11 | LSTM_Model | 0.326219 | 0.373494 | 0.325301 |
| 12 | LSTM_Aug_Model | 0.333333 | 0.345455 | 0.345455 |
with open('/content/drive/MyDrive/AIML/Capstone/Jan-G4---NLP-Chatbot/result_df.csv', "wb") as f:
pickle.dump((result_df), f)
cm = confusion_matrix(y_original,y_pred_index)
cm_label = ['I', 'II', 'III','IV', 'V']
plt.figure(figsize=(12,6))
sns.heatmap(cm, annot=True, cmap='Blues',xticklabels = cm_label, yticklabels = cm_label);
print(classification_report(y_original, y_pred_index))
precision recall f1-score support
0 0.00 0.00 0.00 9
1 0.00 0.00 0.00 19
2 0.00 0.00 0.00 21
3 0.34 1.00 0.50 28
4 0.00 0.00 0.00 6
accuracy 0.34 83
macro avg 0.07 0.20 0.10 83
weighted avg 0.11 0.34 0.17 83
epochs = range(len(base_model_history.history['loss'])) # Get number of epochs
# plot loss learning curves
plt.plot(epochs, base_model_history.history['loss'], label = 'train')
plt.plot(epochs, base_model_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
Text(0.5, 1.0, 'Training and validation loss')
# plot accuracy learning curves
epochs = range(len(base_model_history.history['loss'])) # Get number of epochs
plt.plot(epochs, base_model_history.history['acc'], label = 'train')
plt.plot(epochs, base_model_history.history['val_acc'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
Text(0.5, 1.0, 'Training and validation accuracy')
%cd /content/drive/MyDrive/AIML/Capstone
/content/drive/MyDrive/AIML/Capstone
from keras.models import load_model
model = load_model('finalized_LSTM_model.h5')
def model_prediction(usr_txt):
    # Predict the Potential Accident Level for a raw incident description.
    #
    # Relies on module-level globals: `model` (the Keras model loaded just
    # above), `config` (preprocessing flags — presumably set in another
    # notebook cell; verify it is defined before calling), plus the
    # module-level `pickle`, `np` and `pad_sequences` imports.
    #
    # Parameters: usr_txt (str) — free-text incident description.
    # Returns: None (the result of `print`); the predicted level is printed.
    with open('/content/drive/MyDrive/AIML/Capstone/parameters.pickle', "rb") as f:
        # LabelEncoder and fitted Tokenizer that were saved during training.
        lab_en, tokenizer_data = pickle.load(f)
    from NLP_text_preprocess import PreProcessing
    # Mirror the training-time text cleaning, but keep stopwords and
    # non-alphanumeric handling as configured below.
    pp = PreProcessing(to_lower = config.to_lower,remove_url=config.remove_url,
                       remove_time = config.remove_time,expand_contraction =config.expand_contraction,
                       remove_special_character=config.remove_special_character,
                       remove_punctuation=config.remove_punctuation,
                       remove_whitespace=config.remove_whitespace,
                       keep_alpha_numeric = False,
                       check_spelling=config.check_spelling,
                       remove_stopword=False,
                       lemmatize_word=config.lemmatize_word)
    pre_txt = pp.preprocess(usr_txt)
    tok_txt = tokenizer_data.texts_to_sequences([pre_txt])
    # NOTE(review): maxlen=185 here, but training above padded to maxlen=200 —
    # confirm which length `finalized_LSTM_model.h5` was trained with.
    pad_txt = pad_sequences(tok_txt, padding='post', maxlen=185)
    prediction = model.predict(pad_txt, verbose=0)
    # argmax over the softmax output picks the most likely class index.
    pred_index = np.argmax(prediction,axis=1)
    # Map the class index back to the original label (e.g. 'IV').
    result = lab_en.inverse_transform(pred_index)[0]
    # `print` returns None, so this function has no usable return value.
    return print('The Potential accident level is', result)
model_prediction(industry_df['Description_preprocessed'][1])
The Potential accident level is IV
try:
import nlpaug.augmenter.word as naw
import transformers
except ModuleNotFoundError:
!pip install numpy requests nlpaug
!pip install transformers
import nlpaug.augmenter.word as naw
import transformers
!pip uninstall pandas
!pip install pandas==1.1.5
aug_bert = naw.ContextualWordEmbsAug(
model_path='bert-base-uncased', action="substitute")
industry_df = pd.read_csv("industry_df_with_stopwords.csv")
# Select input and output features
X_text = industry_df['Description_preprocessed']
y_text = industry_df['Potential_Accident_Level']
# Encode labels in column 'Potential Accident Level'and Convert into one-hot encoded vectors:
le = LabelEncoder()
le.fit(y_text)
y_text = le.transform(y_text)
# Divide our data into testing and training sets:
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_text, y_text, test_size = 0.20, random_state = 1, stratify = y_text)
print(np.bincount(y_text_train))
print(np.bincount(y_text_test))
[ 34 76 85 110 23] [ 9 19 21 28 6]
y_text_train = np_utils.to_categorical(y_text_train)
y_text_test = np_utils.to_categorical(y_text_test)
augmented_text = aug_bert.augment(industry_df['Description_preprocessed'][1])
print("Original:")
print(industry_df['Description_preprocessed'][1])
print("Augmented Text:")
print(augmented_text)
Original: during the activation of a sodium sulphide pump the piping wa uncoupled and the sulfide solution wa designed in the area to reach the maid immediately she made use of the emergency shower and wa directed to the ambulatory doctor and later to the hospital note of sulphide solution 48 gram liter Augmented Text: during overnight activation of a sodium sulphide pump the piping wa uncoupled and the sulfide solution wa designed in hospital garden to reach the maid immediately patient made use of the sodium shower and wa directed to this ambulatory doctor and provided by the hospital solution of sulphide is 48 gram liter
len(augmented_text)
310
len(industry_df['Description_preprocessed'][1])
294
aug_roberta = naw.ContextualWordEmbsAug(
model_path='roberta-base', action="substitute")
augmented_text = aug_roberta.augment(industry_df['Description_preprocessed'][1])
print("Original:")
print(industry_df['Description_preprocessed'][1])
print("Augmented Text:")
print(augmented_text)
Original: during the activation of a sodium sulphide pump the piping wa uncoupled and the sulfide solution wa designed in the area to reach the maid immediately she made use of the emergency shower and wa directed to the ambulatory doctor and later to the hospital note of sulphide solution 48 gram liter Augmented Text: during the activation of sodium sodium sulphide gas in piping wa uncoupled hence the sulfide solution wa designed in a area to reach the maid immediately she made use of a emergency shower water wa directed to the ambulatory doctor and later w see hospital note of sulphide solution 48 the liter
X_aug_train = X_text_train.apply(lambda x: aug_roberta.augment(x))
import pickle
pickle.dump(X_aug_train, open('/content/drive/MyDrive/AIML/Capstone/X_aug_train.pickle', 'wb'))
infile = open('/content/drive/MyDrive/AIML/Capstone/X_aug_train.pickle','rb')
X_aug = pickle.load(infile)
X_aug_train =[]
X_aug_train = X_text_train.tolist() + X_aug.tolist()
len(X_aug_train)
656
y_aug_train = y_text_train.tolist() + y_text_train.tolist()
len(y_aug_train)
656
X_aug_train = np.array(X_aug_train)
y_aug_train = np.array(y_aug_train)
# The first step in word embeddings is to convert the words into their corresponding numeric indexes.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_aug_train)
X_text_train = tokenizer.texts_to_sequences(X_aug_train)
X_text_test = tokenizer.texts_to_sequences(X_text_test)
vocab_size = len(tokenizer.word_index) + 1
print("vocab_size:", vocab_size)
maxlen = 200
X_text_train = pad_sequences(X_text_train, padding='post', maxlen=maxlen)
X_text_test = pad_sequences(X_text_test, padding='post', maxlen=maxlen)
vocab_size: 3439
y_text_train = y_aug_train
X_text_train.shape,y_text_train.shape,X_text_test.shape, y_text_test.shape
((656, 200), (656, 5), (83, 200), (83, 5))
import pickle
pickle.dump((X_text_train,y_text_train,X_text_test,y_text_test), open('/content/drive/MyDrive/AIML/Capstone/Train_Test_augment.pickle', 'wb'))
import pickle
infile = open('/content/drive/MyDrive/AIML/Capstone/Train_Test_augment.pickle','rb')
X_train,y_train,X_test,y_test = pickle.load(infile)
base_model = Sequential()
base_model.add(Embedding(3439, output_dim=200, input_length=200, trainable=True))
#LSTM
base_model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.5 , dropout = 0.5)))
base_model.add(Dense(5, activation='softmax'))
base_model.compile(optimizer=optimizers.Adam(lr = 0.001), loss='categorical_crossentropy', metrics=['acc'])
WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
base_model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 200, 200) 687800
bidirectional (Bidirectiona (None, 256) 336896
l)
dense (Dense) (None, 5) 1285
=================================================================
Total params: 1,025,981
Trainable params: 1,025,981
Non-trainable params: 0
_________________________________________________________________
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5)
# fit the keras model on the dataset
base_model_history = base_model.fit(X_train,y_train, epochs=20, batch_size=32, verbose=1, validation_data=(X_test,y_test), callbacks=[rlrp, callback])
Epoch 1/20 21/21 [==============================] - 73s 3s/step - loss: 1.5153 - acc: 0.3476 - val_loss: 1.4960 - val_acc: 0.2169 - lr: 0.0010 Epoch 2/20 21/21 [==============================] - 64s 3s/step - loss: 1.4107 - acc: 0.3720 - val_loss: 1.5501 - val_acc: 0.2530 - lr: 0.0010 Epoch 3/20 21/21 [==============================] - 65s 3s/step - loss: 1.2038 - acc: 0.5655 - val_loss: 1.5174 - val_acc: 0.3133 - lr: 0.0010 Epoch 4/20 21/21 [==============================] - 65s 3s/step - loss: 0.9724 - acc: 0.6418 - val_loss: 1.7459 - val_acc: 0.2289 - lr: 0.0010 Epoch 5/20 21/21 [==============================] - 65s 3s/step - loss: 0.9146 - acc: 0.6738 - val_loss: 1.6208 - val_acc: 0.3012 - lr: 0.0010 Epoch 6/20 21/21 [==============================] - 65s 3s/step - loss: 0.5182 - acc: 0.8323 - val_loss: 1.8769 - val_acc: 0.3012 - lr: 0.0010
base_model.save("LSTM_Base_aug_model.h5")
_, train_accuracy = base_model.evaluate(X_train,y_train, batch_size=8, verbose=0)
_, test_accuracy = base_model.evaluate(X_test,y_test, batch_size=8, verbose=0)
print('Train accuracy: %.2f' % (train_accuracy*100))
print('Test accuracy: %.2f' % (test_accuracy*100))
Train accuracy: 94.05 Test accuracy: 30.12
epochs = range(len(base_model_history.history['loss'])) # Get number of epochs
# plot loss learning curves
plt.plot(epochs, base_model_history.history['loss'], label = 'train')
plt.plot(epochs, base_model_history.history['val_loss'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation loss')
Text(0.5, 1.0, 'Training and validation loss')
epochs = range(len(base_model_history.history['loss'])) # Get number of epochs
plt.plot(epochs, base_model_history.history['acc'], label = 'train')
plt.plot(epochs, base_model_history.history['val_acc'], label = 'test')
plt.legend(loc = 'upper right')
plt.title ('Training and validation accuracy')
Text(0.5, 1.0, 'Training and validation accuracy')
y_pred = base_model.predict(X_test, verbose=0) # Multiclass
y_pred_index = np.argmax(y_pred,axis=1)
y_original = np.argmax(y_test, axis =1)
cm = confusion_matrix(y_original,y_pred_index)
cm_label = ['I', 'II', 'III','IV', 'V']
plt.figure(figsize=(12,6))
sns.heatmap(cm, annot=True, cmap='Blues',xticklabels = cm_label, yticklabels = cm_label);
print(classification_report(y_original, y_pred_index))
precision recall f1-score support
0 0.67 0.22 0.33 9
1 0.24 0.26 0.25 19
2 0.29 0.48 0.36 21
3 0.31 0.18 0.23 28
4 0.12 0.17 0.14 6
accuracy 0.28 83
macro avg 0.33 0.26 0.26 83
weighted avg 0.31 0.28 0.27 83
y_predict = base_model.predict(X_test)
fpr = {}
tpr = {}
#thresh ={}
roc_auc = dict()
n_class = 5
for i in range(n_class):
fpr[i], tpr[i],_ = roc_curve(y_test[:,i], y_predict[:,i])
roc_auc[i] = auc(fpr[i], tpr[i])
# plotting
plt.figure(figsize=(15, 6))
plt.plot(fpr[0], tpr[0], linestyle='--',color='orange', label="ROC curve (area = %0.2f)" % roc_auc[0])
plt.plot(fpr[1], tpr[1], linestyle='--',color='green', label="ROC curve (area = %0.2f)" % roc_auc[1])
plt.plot(fpr[2], tpr[2], linestyle='--',color='blue', label="ROC curve (area = %0.2f)" % roc_auc[2])
plt.plot(fpr[3], tpr[3], linestyle='--',color='yellow', label= "ROC curve (area = %0.2f)" % roc_auc[3])
plt.plot(fpr[4], tpr[4], linestyle='--',color='red', label="ROC curve (area = %0.2f)" % roc_auc[4])
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc="lower right")
plt.legend(loc='best')
plt.savefig('Multiclass ROC',dpi=300);
# Build a Bidirectional-LSTM classifier ending in a 5-way softmax.
deep_inputs = Input(shape=(maxlen,))
embedding_layer = Embedding(vocab_size, 200, trainable=True)(deep_inputs)
# recurrent_dropout disables the cuDNN fast path (hence the TF warnings in the
# original run) but adds regularization inside the recurrence.
LSTM_Layer_1 = Bidirectional(LSTM(units=128, recurrent_dropout=0.5, dropout=0.5))(embedding_layer)
# In the functional API each layer infers its input shape from the previous
# layer's output, so the input_shape arguments the original passed to every
# Dropout were ignored; they are dropped here.
drop_out_layer_1 = Dropout(0.5)(LSTM_Layer_1)
dense_layer_1 = Dense(128, activation='relu')(drop_out_layer_1)
drop_out_layer_2 = Dropout(0.5)(dense_layer_1)
dense_layer_2 = Dense(64, activation='relu')(drop_out_layer_2)
drop_out_layer_3 = Dropout(0.5)(dense_layer_2)
dense_layer_3 = Dense(32, activation='relu')(drop_out_layer_3)
drop_out_layer_4 = Dropout(0.5)(dense_layer_3)
dense_layer_4 = Dense(10, activation='relu')(drop_out_layer_4)
drop_out_layer_5 = Dropout(0.5)(dense_layer_4)
dense_layer_5 = Dense(5, activation='softmax')(drop_out_layer_5)
lstm_model = Model(inputs=deep_inputs, outputs=dense_layer_5)
# 'lr' is deprecated in Keras optimizers in favour of 'learning_rate'.
opt = optimizers.Adam(learning_rate=0.001)
lstm_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])
WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU. WARNING:tensorflow:Layer lstm_3 will not use cuDNN kernels since it doesn't meet the criteria. It will use a generic GPU kernel as fallback when running on GPU.
lstm_model.summary()
Model: "model_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_2 (InputLayer) [(None, 200)] 0
embedding_3 (Embedding) (None, 200, 200) 573000
bidirectional_3 (Bidirectio (None, 256) 336896
nal)
dropout_5 (Dropout) (None, 256) 0
dense_7 (Dense) (None, 128) 32896
dropout_6 (Dropout) (None, 128) 0
dense_8 (Dense) (None, 64) 8256
dropout_7 (Dropout) (None, 64) 0
dense_9 (Dense) (None, 32) 2080
dropout_8 (Dropout) (None, 32) 0
dense_10 (Dense) (None, 10) 330
dropout_9 (Dropout) (None, 10) 0
dense_11 (Dense) (None, 5) 55
=================================================================
Total params: 953,513
Trainable params: 953,513
Non-trainable params: 0
_________________________________________________________________
# Stop when validation loss stalls; also shrink the LR on plateau.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
# NOTE(review): factor=0.0001 multiplies the LR by 1e-4 on a plateau, which
# effectively freezes training after one trigger — the conventional value is
# around 0.1.  Confirm this is intentional.
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.0001, patience=5)
training_history = lstm_model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1, validation_data=(X_test, y_test), callbacks=[rlrp, callback])
Epoch 1/20 21/21 [==============================] - 63s 2s/step - loss: 1.5936 - acc: 0.2744 - val_loss: 1.5563 - val_acc: 0.3373 - lr: 0.0010 Epoch 2/20 21/21 [==============================] - 49s 2s/step - loss: 1.5647 - acc: 0.3262 - val_loss: 1.5360 - val_acc: 0.3373 - lr: 0.0010 Epoch 3/20 21/21 [==============================] - 48s 2s/step - loss: 1.5531 - acc: 0.3064 - val_loss: 1.5308 - val_acc: 0.3373 - lr: 0.0010 Epoch 4/20 21/21 [==============================] - 50s 2s/step - loss: 1.5271 - acc: 0.3034 - val_loss: 1.5132 - val_acc: 0.3373 - lr: 0.0010 Epoch 5/20 21/21 [==============================] - 47s 2s/step - loss: 1.5291 - acc: 0.2973 - val_loss: 1.5009 - val_acc: 0.3373 - lr: 0.0010 Epoch 6/20 21/21 [==============================] - 48s 2s/step - loss: 1.5278 - acc: 0.3018 - val_loss: 1.5088 - val_acc: 0.3373 - lr: 0.0010 Epoch 7/20 21/21 [==============================] - 52s 3s/step - loss: 1.4660 - acc: 0.3323 - val_loss: 1.4627 - val_acc: 0.3373 - lr: 0.0010 Epoch 8/20 21/21 [==============================] - 47s 2s/step - loss: 1.4053 - acc: 0.3186 - val_loss: 1.4499 - val_acc: 0.3373 - lr: 0.0010 Epoch 9/20 21/21 [==============================] - 47s 2s/step - loss: 1.3690 - acc: 0.3537 - val_loss: 1.4397 - val_acc: 0.3373 - lr: 0.0010 Epoch 10/20 21/21 [==============================] - 47s 2s/step - loss: 1.3002 - acc: 0.3613 - val_loss: 1.4534 - val_acc: 0.4096 - lr: 0.0010 Epoch 11/20 21/21 [==============================] - 49s 2s/step - loss: 1.3212 - acc: 0.4009 - val_loss: 1.4833 - val_acc: 0.3735 - lr: 0.0010 Epoch 12/20 21/21 [==============================] - 48s 2s/step - loss: 1.2267 - acc: 0.4284 - val_loss: 1.5642 - val_acc: 0.3855 - lr: 0.0010 Epoch 13/20 21/21 [==============================] - 76s 4s/step - loss: 1.1534 - acc: 0.4893 - val_loss: 1.5718 - val_acc: 0.2892 - lr: 0.0010 Epoch 14/20 21/21 [==============================] - 53s 2s/step - loss: 1.1281 - acc: 0.5015 - val_loss: 1.5289 - val_acc: 0.3253 - lr: 
0.0010
# evaluate the keras model
# NOTE(review): fit() above trained on X_train/X_test, but this cell evaluates
# X_text_train/X_text_test (defined earlier in the notebook).  If those are not
# the same arrays, the printed accuracies do not describe the splits the model
# was trained/validated on — confirm which arrays are intended.
_, train_accuracy = lstm_model.evaluate(X_text_train, y_text_train, batch_size=8, verbose=0)
_, test_accuracy = lstm_model.evaluate(X_text_test, y_text_test, batch_size=8, verbose=0)
print('Train accuracy: %.2f' % (train_accuracy*100))
print('Test accuracy: %.2f' % (test_accuracy*100))
Train accuracy: 28.35 Test accuracy: 21.69
# Confusion matrix + classification report for the LSTM model.
# NOTE(review): predicts X_text_test although fit() used X_test as validation
# data — same inconsistency as the evaluate cell above; confirm intent.
y_pred = lstm_model.predict(X_text_test, verbose=0)  # Multiclass probabilities
y_pred_index = np.argmax(y_pred, axis=1)   # predicted class index per sample
y_original = np.argmax(y_text_test, axis=1)  # one-hot ground truth -> class index
cm = confusion_matrix(y_original, y_pred_index)
cm_label = ['I', 'II', 'III', 'IV', 'V']   # accident severity levels
plt.figure(figsize=(12, 6))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=cm_label, yticklabels=cm_label);
print(classification_report(y_original, y_pred_index))
precision recall f1-score support
0 0.00 0.00 0.00 9
1 0.21 0.79 0.33 19
2 0.00 0.00 0.00 21
3 0.50 0.11 0.18 28
4 0.00 0.00 0.00 6
accuracy 0.22 83
macro avg 0.14 0.18 0.10 83
weighted avg 0.22 0.22 0.13 83
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class TransformerBlock(layers.Layer):
    """One Transformer encoder block: multi-head self-attention followed by a
    position-wise feed-forward network, each with dropout, a residual
    connection, and layer normalization."""

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can serialize them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim), ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-4)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-4)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        # Self-attention (query = key = value = inputs) + residual + norm.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Feed-forward sub-layer + residual + norm.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        # get_config must return the plain constructor arguments, not live
        # sub-layer objects; the original stored the layers themselves, which
        # prevents the model from being re-created from a saved config.
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can serialize them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Embed positions 0..seq_len-1; broadcasting adds them to every
        # sequence in the batch.
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        # Return the plain constructor arguments, not the live sub-layers the
        # original stored — storing layers breaks re-loading from a saved config.
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config
embed_dim = 300  # Embedding size for each token
num_heads = 5    # Number of attention heads
ff_dim = 200     # Hidden layer size in feed forward network inside transformer
# Sequence classifier: token+position embeddings -> one Transformer block ->
# global average pooling -> small dense head -> 5-way softmax.
inputs = layers.Input(shape=(200,))  # NOTE(review): hard-coded 200 should equal maxlen — confirm
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(5, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
print(model.summary())
Model: "model_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_3 (InputLayer) [(None, 200)] 0
token_and_position_embeddin (None, 200, 300) 919500
g (TokenAndPositionEmbeddin
g)
transformer_block (Transfor (None, 200, 300) 1926500
merBlock)
global_average_pooling1d (G (None, 300) 0
lobalAveragePooling1D)
dropout_12 (Dropout) (None, 300) 0
dense_14 (Dense) (None, 20) 6020
dropout_13 (Dropout) (None, 20) 0
dense_15 (Dense) (None, 5) 105
=================================================================
Total params: 2,852,125
Trainable params: 2,852,125
Non-trainable params: 0
_________________________________________________________________
None
X_train.shape,y_train.shape,X_test.shape, y_test.shape
((656, 200), (656, 5), (83, 200), (83, 5))
# Training configuration for the Transformer model.
epochs = 50
batch_size = 32
loss = "categorical_crossentropy"
# 'lr' is deprecated in Keras optimizers in favour of 'learning_rate'.
opt = keras.optimizers.Adam(learning_rate=1e-3)
metrics = ["acc"]
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# NOTE(review): factor=0.001 cuts the LR by 1000x on a plateau — the
# conventional value is around 0.1; confirm this is intentional.
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.001, patience=3)
model.compile(optimizer=opt, loss=loss, metrics=metrics)
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), callbacks=[rlrp, callback])
Epoch 1/50 21/21 [==============================] - 4s 129ms/step - loss: 1.6719 - acc: 0.2774 - val_loss: 1.6034 - val_acc: 0.3373 - lr: 0.0010 Epoch 2/50 21/21 [==============================] - 2s 104ms/step - loss: 1.5980 - acc: 0.3186 - val_loss: 1.5682 - val_acc: 0.2530 - lr: 0.0010 Epoch 3/50 21/21 [==============================] - 2s 110ms/step - loss: 1.5838 - acc: 0.3018 - val_loss: 1.5880 - val_acc: 0.3494 - lr: 0.0010 Epoch 4/50 21/21 [==============================] - 2s 105ms/step - loss: 1.5839 - acc: 0.3034 - val_loss: 1.5455 - val_acc: 0.2530 - lr: 0.0010 Epoch 5/50 21/21 [==============================] - 2s 105ms/step - loss: 1.5407 - acc: 0.3460 - val_loss: 1.5264 - val_acc: 0.3373 - lr: 0.0010 Epoch 6/50 21/21 [==============================] - 2s 107ms/step - loss: 1.4275 - acc: 0.3780 - val_loss: 1.5558 - val_acc: 0.3373 - lr: 0.0010 Epoch 7/50 21/21 [==============================] - 2s 107ms/step - loss: 1.2956 - acc: 0.4405 - val_loss: 1.7134 - val_acc: 0.3373 - lr: 0.0010 Epoch 8/50 21/21 [==============================] - 2s 106ms/step - loss: 1.1689 - acc: 0.5183 - val_loss: 1.9582 - val_acc: 0.3133 - lr: 0.0010
# Persist the trained Transformer, then report accuracy on both splits.
model.save("Transformer_aug_model.h5")
_, train_accuracy = model.evaluate(X_train, y_train, batch_size=8, verbose=0)
_, test_accuracy = model.evaluate(X_test, y_test, batch_size=8, verbose=0)
print('Train accuracy: %.2f' % (train_accuracy*100))
print('Test accuracy: %.2f' % (test_accuracy*100))
# Training curves: accuracy and loss, train vs validation, one figure each.
for metric_key, fig_title, y_label in (('acc', 'Model Accuracy', 'accuracy'),
                                       ('loss', 'Model Loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(fig_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
#predict the labels
# NOTE(review): predicts X_text_test / compares y_text_test, although this
# model was fitted and evaluated on X_test/y_test above — confirm which split
# is intended (same inconsistency as the LSTM evaluation cells).
y_predicted_labels = model.predict(X_text_test)
y_pred_index = np.argmax(y_predicted_labels, axis=1)  # predicted class per sample
y_original = np.argmax(y_text_test, axis=1)           # one-hot -> class index
#plot the confusion matrix
cm = confusion_matrix(y_original, y_pred_index)
cm_label = ['I', 'II', 'III', 'IV', 'V']
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, cmap='Blues', xticklabels=cm_label, yticklabels=cm_label, fmt='g');
print(classification_report(y_original, y_pred_index))
precision recall f1-score support
0 0.00 0.00 0.00 9
1 0.00 0.00 0.00 19
2 0.30 0.90 0.45 21
3 0.35 0.25 0.29 28
4 0.00 0.00 0.00 6
accuracy 0.31 83
macro avg 0.13 0.23 0.15 83
weighted avg 0.19 0.31 0.21 83
# One-vs-rest ROC / AUC per class for the Transformer model.
y_trans_predict = model.predict(X_test)
fpr = {}
tpr = {}
roc_auc = dict()
n_class = 5
for i in range(n_class):
    # Column i of the one-hot y_test is the binary indicator for class i.
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_trans_predict[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# plotting: one dashed curve per class (original colors preserved) plus the
# chance diagonal.  The five duplicated plt.plot lines become a loop, and the
# redundant first plt.legend call (overridden by the second) is removed.
plt.figure(figsize=(15, 6))
curve_colors = ['orange', 'green', 'blue', 'yellow', 'red']
for i in range(n_class):
    plt.plot(fpr[i], tpr[i], linestyle='--', color=curve_colors[i],
             label="ROC curve (area = %0.2f)" % roc_auc[i])
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('Multiclass ROC', dpi=300);
!pip install --upgrade simpletransformers
!pip show simpletransformers
Name: simpletransformers Version: 0.63.3 Summary: An easy-to-use wrapper library for the Transformers library. Home-page: https://github.com/ThilinaRajapakse/simpletransformers/ Author: Thilina Rajapakse Author-email: chaturangarajapakshe@gmail.com License: UNKNOWN Location: /usr/local/lib/python3.7/dist-packages Requires: tensorboard, sentencepiece, scipy, seqeval, transformers, datasets, numpy, streamlit, tqdm, pandas, requests, tokenizers, scikit-learn, regex, wandb Required-by:
# Load the preprocessed dataset and keep only the text and target columns.
industry_df = pd.read_csv("industry_df_with_stopwords.csv")
# .copy() prevents pandas' SettingWithCopyWarning when the slice is mutated below.
df = industry_df[['Description_preprocessed', 'Potential_Accident_Level']].copy()
# Encode the accident levels as integer category codes (0..4).
df['Potential_Accident_Level'] = df['Potential_Accident_Level'].astype('category').cat.codes
# For SimpleTransformers, the 'text' column should be the first column
df.columns = ['text', 'label']  # we need this for simpletransformers
df.head(2)
| text | label | |
|---|---|---|
| 0 | while removing the drill rod of the jumbo 08 f... | 3 |
| 1 | during the activation of a sodium sulphide pum... | 3 |
df['label'].value_counts()
3 138 2 106 1 95 0 43 4 29 Name: label, dtype: int64
import nltk
# WordNet powers the synonym lookup below; stopwords are excluded from replacement.
nltk.download('stopwords')
nltk.download('wordnet')
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip. [nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Unzipping corpora/wordnet.zip.
True
from nltk.corpus import wordnet

def get_synonyms(word):
    """Return WordNet synonyms of *word*, lowercased and stripped to ascii
    letters/spaces, excluding the word itself."""
    allowed_chars = ' qwertyuiopasdfghjklzxcvbnm'
    synonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            cleaned = "".join([ch for ch in candidate if ch in allowed_chars])
            synonyms.add(cleaned)
    # The word itself is not a useful replacement candidate.
    synonyms.discard(word)
    return list(synonyms)
from nltk.corpus import stopwords
# create a list of stopwords (unnecessary words to be removed from the corpus)
stop_words = list(stopwords.words('english'))
import random

# Once the synonym is generated, we'll define a function to replace the words
# in a sentence with their synonyms for a specified number of examples.
def synonym_replacement(words, n):
    """Replace up to n non-stopword words in *words* (a sentence string) with a
    random WordNet synonym; returns the augmented sentence as a string."""
    words = words.split()
    new_words = words.copy()
    # Candidate words, deduplicated and shuffled so results vary between calls.
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            # Replace every occurrence of the chosen word.
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:  # only replace up to n words
            # NOTE(review): when n == 0 one replacement can still happen before
            # this check fires — confirm callers passing n=0 expect that.
            break
    sentence = ' '.join(new_words)
    return sentence
trial_sent = df['text'][6]
print(trial_sent)
the collaborator report that he wa on street 09 holding in his left hand the volumetric balloon when he slipped and when placing his hand on the ground the volumetric balloon ended up breaking caused a small wound in his left hand
for n in range(3):
print(f" Example of Synonym Replacement: {synonym_replacement(trial_sent,n)}")
Example of Synonym Replacement: the collaborator report that he wa on street 09 holding in his left hand the volumetric balloon when he slipped and when post his hand on the ground the volumetric balloon ended up breaking caused a small wound in his left hand Example of Synonym Replacement: the collaborator report that he wa on street 09 holding in his left hand the volumetric balloon when he slipped and when come in his hand on the ground the volumetric balloon ended up breaking caused a small wound in his left hand Example of Synonym Replacement: the collaborator report that he wa on street 09 holding in his left hand the volumetric billow when he slipped and when placing his hand on the ground the volumetric billow ended up breaking caused a small wind up in his left hand
df_aug = df.copy()
augmented_sentences = []
augmented_sentences_labels = []
# Number of augmented copies to generate per record, keyed by label code
# (0='I', 1='II', 2='III', 3='IV', 4='V').  Minority classes get more copies
# to rebalance the class distribution.  (The original's five copy-pasted
# branches are folded into this table; its "Accident Level V" comment on the
# label==0 branch was wrong — label 0 is Level I.)
copies_per_label = {0: 5, 1: 2, 2: 2, 3: 1, 4: 9}
for i in df_aug.index:
    label = df_aug['label'][i]
    # As in the original cell, the copy counter n doubles as the number of
    # words to replace (n = 0, 1, ..., copies-1).
    for n in range(copies_per_label[label]):
        augmented_sentences.append(synonym_replacement(df_aug['text'][i], n))
        augmented_sentences_labels.append(label)
# creating dataframes for augmented text and their labels
augmented_sentences = pd.DataFrame(augmented_sentences)
augmented_sentences_labels = pd.DataFrame(augmented_sentences_labels)
aug_df = pd.concat([augmented_sentences, augmented_sentences_labels], axis=1, ignore_index=True)
aug_df.rename(columns={0: 'text', 1: 'label'}, inplace=True)
# DataFrame.append is deprecated (removed in pandas 2.x) — use pd.concat
# to show the first and last two rows, and to merge the augmented rows.
display(pd.concat([aug_df.head(2), aug_df.tail(2)]))
df_aug = pd.concat([df_aug, aug_df], ignore_index=True)
df.shape, aug_df.shape, df_aug.shape
def vis_countplot(self, x):
    # Bar chart of the value counts of *x*, with the count printed above each bar.
    # NOTE(review): the first parameter is named `self` but this is a plain
    # function, not a method — the first argument (the dataframe at the call
    # below) is never used in the body.  Signature kept for existing callers.
    plt.style.use("dark_background")
    plt.figure(figsize=(10, 5))
    figure = sns.countplot(x, palette='autumn')
    figure.set_xticklabels(figure.get_xticklabels(), rotation=90)
    for p in figure.patches:
        height = p.get_height()
        # Annotate each bar with its height (the class count).
        figure.text(p.get_x() + p.get_width()/2., height + 0.1, height, ha="center")

# Visualize the label distribution after augmentation.
vis_countplot(df_aug, df_aug['label'])
# Move 'text' back to the first column (SimpleTransformers expects text first).
first_column = df_aug.pop('text')
# insert column using insert(position, column_name, first_column) function
df_aug.insert(0, 'text', first_column)
df_aug.head(1)
| text | label | |
|---|---|---|
| 0 | while removing the drill rod of the jumbo 08 f... | 3 |
from sklearn.model_selection import train_test_split
target = 'label'
# Stratified 80/20 split preserves the (rebalanced) class proportions in both sets.
df_train, df_test = train_test_split(df_aug, test_size=0.2,
                                     random_state=55,
                                     stratify=df_aug[target])
print(f"df : {df_aug.shape}")
print(f"\ndf_train : {df_train.shape}")
print(f"\ndf_test : {df_test.shape}")
df_train.head(2)
| text | label | |
|---|---|---|
| 1112 | employee wa performing drilling activity with ... | 2 |
| 300 | being 0010 am at time when the collaborator wa... | 2 |
df_train.label.value_counts()
2 254 4 232 1 228 3 221 0 206 Name: label, dtype: int64
df_test.label.value_counts()
2 64 4 58 1 57 3 55 0 52 Name: label, dtype: int64
import pickle
# Persist the train/test splits so later sessions can skip re-augmentation.
pickle_path = '/content/drive/MyDrive/AIML/Capstone/Transformers_train_test.pickle'
with open(pickle_path, 'wb') as outfile:
    pickle.dump((df_train, df_test), outfile)
# Reload them; the context managers close both handles (the original opened
# `infile` without ever closing it — a leaked file handle).
with open(pickle_path, 'rb') as infile:
    df_train, df_test = pickle.load(infile)
import torch
torch.cuda.is_available()
True
import tensorflow as tf
tf.test.gpu_device_name()
#Standard output is '/device:GPU:0'
'/device:GPU:0'
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
# Show INFO-level progress from simpletransformers, but quieten the very
# verbose HuggingFace `transformers` logger down to warnings only.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
!mkdir distilbert-New
import wandb
wandb.login()
wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc
True
model_type = 'distilbert'
model_name = 'distilbert-base-cased'
# SimpleTransformers training configuration.
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    # False: the cached dev features were being silently reused for EVERY
    # eval_model call (the train-set evaluation later in this notebook reported
    # metrics identical to the test-set one for exactly this reason).
    "use_cached_eval_features": False,
    # early stopping
    "early_stopping_delta": 0.01,
    "early_stopping_metric": "loss",
    "use_early_stopping": True,
    # Loss must be MINIMIZED — the original had False here, which made early
    # stopping treat a growing loss as an improvement.
    "early_stopping_metric_minimize": True,
    "early_stopping_patience": 3,
    # (The original repeated this key under "evaluation" below; the duplicate
    # is removed — later duplicates silently override earlier ones.)
    "evaluate_during_training_steps": 500,
    # paths
    "output_dir": "/content/drive/MyDrive/AIML/Capstone/Jan-G4---NLP-Chatbot/distilbert-New",
    # size
    "train_batch_size": 32,
    "max_seq_length": 200,  # use small value to avoid OOM
    "num_train_epochs": 20,
    "wandb_project": "Chatbot_Distillbert-2",
    # evaluation
    "evaluate_during_training": False,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "eval_batch_size": 1,
    "gradient_accumulation_steps": 1,
}
%%time
# Fresh DistilBERT (cased) encoder with a newly initialized 5-way classification head.
model_distilbert = ClassificationModel(model_type, model_name, args=train_args, num_labels=5, use_cuda=True)
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight'] - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
CPU times: user 1.45 s, sys: 268 ms, total: 1.72 s Wall time: 9.85 s
# Fine-tune on the augmented training set (eval_df is unused because
# evaluate_during_training is False in train_args).
model_distilbert.train_model(df_train, eval_df=None)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_distilbert_200_5_2
INFO:simpletransformers.classification.classification_model: Initializing WandB run for training.
./wandb/run-20220101_150837-3p0tth5a/logsINFO:simpletransformers.classification.classification_model: Training of distilbert model complete. Saved to /content/drive/MyDrive/AIML/Capstone/Jan-G4---NLP-Chatbot/distilbert-New.
(720, 0.1782145797131913)
from simpletransformers.classification import ClassificationModel, ClassificationArgs
model_type = 'distilbert'
model_name = 'distilbert-base-cased'
# Reload the fine-tuned model from the output_dir written by train_model above.
load_model_distilbert = ClassificationModel(model_type, "/content/drive/MyDrive/AIML/Capstone/Jan-G4---NLP-Chatbot/distilbert-New", use_cuda=True,)
import sklearn
# Evaluate on the held-out test split; accuracy is added to the default MCC/loss.
result_distilbert, model_outputs_distilbert, wrong_predictions_distilbert = load_model_distilbert.eval_model(df_test,
                                                                                                            acc = sklearn.metrics.accuracy_score)
INFO:simpletransformers.classification.classification_utils: Features loaded from cache at cache_dir/cached_dev_distilbert_200_5_2
INFO:simpletransformers.classification.classification_model: Initializing WandB run for evaluation.
| Training loss | █▃▁▁▁▁▁▁▁▁▁▁▁▁ |
| global_step | ▁▂▂▃▃▄▄▅▅▆▆▇▇█ |
| lr | █▇▇▆▆▅▅▄▄▃▃▂▂▁ |
| Training loss | 0.00064 |
| global_step | 700 |
| lr | 0.0 |
./wandb/run-20220101_151313-3aa49aho/logsINFO:simpletransformers.classification.classification_model:{'mcc': 0.9783924226872274, 'acc': 0.9825174825174825, 'eval_loss': 0.163581965686558}
result_distilbert
{'acc': 0.9825174825174825,
'eval_loss': 0.163581965686558,
'mcc': 0.9783924226872274}
model_outputs_distilbert.shape
(286, 5)
# NOTE(review): this train-split evaluation reported metrics IDENTICAL to the
# test-split evaluation above; the INFO log shows "Features loaded from cache
# at cache_dir/cached_dev_..." — with "use_cached_eval_features": True the dev
# features built from df_test were silently reused, so this cell never actually
# measured train accuracy.  Clear cache_dir/ or disable the cache before
# trusting this number.
result_distilbert_train, model_outputs_distilbert_train, wrong_predictions_distilbert_train = load_model_distilbert.eval_model(df_train,
                                                                                                                              acc = sklearn.metrics.accuracy_score)
INFO:simpletransformers.classification.classification_utils: Features loaded from cache at cache_dir/cached_dev_distilbert_200_5_2
INFO:simpletransformers.classification.classification_model: Initializing WandB run for evaluation.
./wandb/run-20220101_154003-3ool3vv2/logsINFO:simpletransformers.classification.classification_model:{'mcc': 0.9783924226872274, 'acc': 0.9825174825174825, 'eval_loss': 0.163581965686558}
result_distilbert_train
{'acc': 0.9825174825174825,
'eval_loss': 0.163581965686558,
'mcc': 0.9783924226872274}
df_test
| level_0 | index | text | label | |
|---|---|---|---|---|
| 0 | 0 | 164 | during execution of drilling on the target bol... | 3 |
| 1 | 1 | 1116 | the collaborator completed the misalignment of... | 1 |
| 2 | 2 | 349 | the technician wa doing the magnetometric surv... | 0 |
| 3 | 3 | 1401 | the injured and his collaborator at the time o... | 2 |
| 4 | 4 | 1290 | the industrial cleaning worker cristian wa per... | 1 |
| ... | ... | ... | ... | ... |
| 281 | 281 | 1020 | employee report that he performed an activenes... | 1 |
| 282 | 282 | 803 | in plant while doing work on geo pump reducer ... | 3 |
| 283 | 283 | 605 | the worker carried out the disassembly of 03 s... | 2 |
| 284 | 284 | 1380 | at 0400 i on 051917 the mechanic on duty of se... | 1 |
| 285 | 285 | 1171 | by averaging at the office of amani lilian pre... | 1 |
286 rows × 4 columns
# Displaying a sample prediction
sample_idx = 5
# NOTE(review): this runs predict() over the ENTIRE test text column just to
# display one sample — predicting only [df_test['text'].iloc[sample_idx]]
# would be much cheaper.
pred = (df_test['text'])
# Make predictions with the model
predictions, raw_outputs = load_model_distilbert.predict(pred)
print(f'Sampled Text: {df_test["text"].iloc[sample_idx]}')
print(f'True Class: {df_test["label"].iloc[sample_idx]}')
print(f'Predicted Class : {predictions[sample_idx]}')
INFO:simpletransformers.classification.classification_utils: Features loaded from cache at cache_dir/cached_dev_distilbert_200_5_2
Sampled Text: the employee who wa hitchhiking on the fuzz 403 truck equipment he crossed the central anterior of the track to catch the distinguish of the bicycle loader with some other operator who wa stopped in the opposite direction upon returning to the truck it wa hit in the branch left by the loader play out that wa traveling along the road and passed the cep403 on the right True Class: 4 Predicted Class : 4
y_preds_distilbert, _, = load_model_distilbert.predict(df_test['text'])
INFO:simpletransformers.classification.classification_utils: Features loaded from cache at cache_dir/cached_dev_distilbert_200_5_2
# Side-by-side comparison of predictions vs ground truth for the test split.
# (Fixes the "Predicitons" column-name typo from the original cell.)
result = pd.DataFrame({"Predictions": y_preds_distilbert, "Actual": df_test['label']})
result
| Predicitons | Actual | |
|---|---|---|
| 0 | 3 | 3 |
| 1 | 1 | 1 |
| 2 | 0 | 0 |
| 3 | 2 | 2 |
| 4 | 1 | 1 |
| ... | ... | ... |
| 281 | 1 | 1 |
| 282 | 3 | 3 |
| 283 | 2 | 2 |
| 284 | 1 | 1 |
| 285 | 1 | 1 |
286 rows × 2 columns
# Displaying Confusion Matrix
from sklearn.metrics import confusion_matrix
def plot_cm(y_true, y_pred, title, figsize=(5,5)):
    """Plot a confusion matrix heatmap annotated with row percentages.

    y_true  -- ground-truth labels (rows of the matrix, 'Actual' axis)
    y_pred  -- predicted labels (columns, 'Predicted' axis)
    title   -- figure title
    figsize -- matplotlib figure size
    """
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    # Row sums give per-class totals, used to express cells as percentages.
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Diagonal cells also show the count/total fraction.
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                # Leave zero cells blank to reduce clutter.
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    plt.title(title)
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)
# plot_cm's signature is (y_true, y_pred, ...): the ground-truth labels must
# come first.  The original call passed the predictions as y_true, transposing
# the matrix relative to its 'Actual'/'Predicted' axis labels.
plot_cm(df_test['label'].values, y_preds_distilbert, 'Confusion matrix for DISTILBERT model', figsize=(7,7))
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score
# Headline metrics for the DistilBERT model, collected into a comparison table.
# DataFrame.append is deprecated (removed in pandas 2.x) — build the row as a
# one-row DataFrame and use pd.concat instead.
model_performance = pd.DataFrame(columns=['Model', 'Test Accuracy', 'Test Loss', 'MCC', 'F1', 'Precision', 'Recall'])
distilbert_row = pd.DataFrame([{'Model': 'DistilBert with Data Augmentation',
                                'Test Accuracy': result_distilbert['acc'],
                                'Test Loss': result_distilbert['eval_loss'],
                                'MCC': result_distilbert['mcc'],
                                'F1': f1_score(df_test['label'], y_preds_distilbert, average='micro'),
                                'Precision': precision_score(df_test['label'], y_preds_distilbert, average='micro'),
                                'Recall': recall_score(df_test['label'], y_preds_distilbert, average='micro'),
                                }])
model_performance = pd.concat([model_performance, distilbert_row], ignore_index=True)
model_performance
model_performance
| Model | Test Accuracy | Test Loss | MCC | F1 | Precision | Recall | |
|---|---|---|---|---|---|---|---|
| 0 | DistilBert with Data Augmentation | 0.982517 | 0.163582 | 0.978392 | 0.982517 | 0.982517 | 0.982517 |
sample = [df_test['text'].iloc[10]]
sample
['being approximately in the no 1880 cx781 my kevin helper of jumbo 55 removed the drill rod that wa in the drilling hole instant that break the chain of subjection of the table of the drilling machine sliding down achieving rubbing the index finger of the left hand causing the injury']
pred, _, = load_model_distilbert.predict(sample)
INFO:simpletransformers.classification.classification_utils: Features loaded from cache at cache_dir/cached_dev_distilbert_200_5_2
pred[0]
3
df_test['label'].iloc[10]
3